# @author Tansty
# @email tanstyztz@qq.com
# @description 爬取香港新闻网第1页的数据
import requests
import parsel
import re
import xlwt
import openpyxl
import time
def get_page(one_url):
    """Fetch one Hong Kong news press-release page and return its plain text.

    Downloads *one_url*, extracts the ``#pressrelease`` span and strips the
    surrounding HTML markup.

    :param one_url: absolute URL of a news detail page.
    :return: cleaned article text, or ``''`` when the page has no
        ``#pressrelease`` element.
    """
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    # Timeout so a stalled server cannot hang the whole crawl indefinitely.
    r = requests.get(one_url, headers=headers, timeout=30)
    body = r.content.decode('utf-8')
    sel = parsel.Selector(body)
    text = sel.css("#pressrelease").get()
    if text is None:
        # The original str(text) would have produced the literal 'None';
        # return empty text instead when the element is missing.
        return ''
    # Peel off the wrapper span, then any <div> blocks (opening tag plus the
    # following closing line), remaining stray tags, and <br> line breaks.
    text1 = text.replace('<span id="pressrelease" class="fontSize1">', '').strip()
    text1 = re.sub("<div.*\n.*</.*\n", "", text1).replace("<div>", "").replace("</span>", "").replace("</div>", "").strip()
    text1 = re.sub("<d.*>", "", text1).strip().replace("<br>", "")
    return text1
# def sava_data(list):
#     # 创建一个workbook 设置编码
#     workbook = xlwt.Workbook(encoding='utf-8')
#     # 创建一个worksheet
#     worksheet = workbook.add_sheet('NewsWorkSheet')
#     # 写入excel
#     # 参数对应 行, 列, 值
#     worksheet.write(0, 0, label='标题')
#     worksheet.write(0, 1, label='时间')
#     worksheet.write(0, 2, label='新闻内容')
#     worksheet.write(0, 3, label='链接')
#     for i in range(len(list)):
#         worksheet.write(i+1, 0, label=list[i]['title'])
#         worksheet.write(i+1, 1, label=list[i]['date'])
#         worksheet.write(i+1, 2, label=list[i]['page_content'])
#         worksheet.write(i+1, 3, label=list[i]['link'])
#     # 保存
#     workbook.save('news1.xls')
def sava_data(list):
    """Write the scraped news records into an Excel workbook (news2.xlsx).

    :param list: sequence of dicts with keys ``'title'``, ``'date'``,
        ``'page_content'`` and ``'link'``.  (Parameter name kept for caller
        compatibility even though it shadows the builtin.)
    """
    records = list  # local alias so the shadowed builtin name is not used below
    try:
        # Reuse the existing workbook when present so earlier sheets survive.
        wb = openpyxl.load_workbook(r"C:\Users\ASUS\Desktop\background\spider\taobao\news2.xlsx")
    except FileNotFoundError:
        # Robustness: fall back to a fresh workbook instead of crashing when
        # the hard-coded source file is absent.
        wb = openpyxl.Workbook()
    sheet0 = wb.create_sheet("data", index=0)
    # Header row — openpyxl rows/columns are 1-based.
    sheet0.cell(1, 1, value='标题')
    sheet0.cell(1, 2, value='时间')
    sheet0.cell(1, 3, value='新闻内容')
    sheet0.cell(1, 4, value='链接')
    # Data rows start at row 2, one record per row.
    for row, item in enumerate(records, start=2):
        sheet0.cell(row, 1, value=item['title'])
        sheet0.cell(row, 2, value=item['date'])
        sheet0.cell(row, 3, value=item['page_content'])
        sheet0.cell(row, 4, value=item['link'])
    # BUG FIX: openpyxl writes only the OOXML format, so the extension must
    # be .xlsx — saving as 'news2.xls' produced a mislabelled file that
    # Excel/openpyxl reject on reopen.
    wb.save('news2.xlsx')
# Search-results URL on the HK government search portal: query is
# "粤港澳大湾区" (Greater Bay Area), page 1, sorted by date.
url = "https://www.search.gov.hk/result?ui_lang=zh-hk&proxystylesheet=ogcio_home_adv_frontend&output=xml_no_dtd&ui_charset=UTF-8&a_submit=false&query=%E7%B2%A4%E6%B8%AF%E6%BE%B3%E5%A4%A7%E6%B9%BE%E5%8C%BA&ie=UTF-8&oe=UTF-8&site=gia_home&tpl_id=stdsearch&gp=0&gp0=gia_home&gp1=gia_home&p_size=10&num=10&doc_type=all&as_filetype=&as_q=&as_epq=&is_epq=&as_oq=&is_oq=&as_eq=&is_eq=&r_lang=&lr=&web=this&sw=1&txtonly=0&rwd=0&date_v=within&date_last=20170701%2C20211007&s_date_year=2021&s_date_month=01&s_date_day=01&e_date_year=2021&e_date_month=10&e_date_day=07&last_mod=20170701%2C20211007&sort=date%3AD%3AL%3Ad1&page=1"
# Browser-like User-Agent so the portal serves the normal HTML page.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
r = requests.get(url, headers=headers)
body = r.content.decode('utf-8')
sel = parsel.Selector(body)
# Pull the three parallel result lists: headline fragments, detail-page
# links, and the metadata spans that carry the publication dates.
tiles, links, dates = (
    sel.css('.itemDetailsTitle h3::text').getall(),
    sel.css('.itemDetailsTitle::attr(href)').getall(),
    sel.css('.miscHolder>span::text').getall(),
)
# Assemble one record per news item.  The first headline is split across
# tiles[0] and tiles[1] by the site's markup, so those two fragments are
# merged into a single title and index 1 is skipped.
# BUG FIX: the original used two independent `if` statements, so at i == 0
# the merged title was immediately overwritten by plain tiles[0] in the
# trailing `else`; `elif` keeps the merged title intact.
data_list = []
for i in range(len(tiles)):
    data = {}
    if i == 0:
        data['title'] = str(tiles[i]).strip() + str(tiles[i + 1]).strip()
    elif i == 1:
        # Second fragment of the first headline — already consumed above.
        continue
    else:
        data['title'] = str(tiles[i]).strip()
    data_list.append(data)
# Attach the detail-page link and full article text to each record,
# pausing between requests to stay polite to the server.
for idx, raw_link in enumerate(links):
    link = str(raw_link).strip()
    data_list[idx]['link'] = link
    data_list[idx]['page_content'] = get_page(link)
    time.sleep(0.75)
# The metadata spans come in pairs per result; the publication date is the
# second span of each pair (odd indices), mapped onto record idx // 2.
for idx in range(1, len(dates), 2):
    record = data_list[idx // 2]
    record['date'] = str(dates[idx]).strip()
print(data_list)
sava_data(data_list)